# reproducibility: fix the RNG state before any random sampling
set.seed(100)
# hold out 20% of the rows for testing, stratified on the target so that
# train and test keep a comparable attrition rate
splitted <- data_clean %>%
  initial_split(prop = 0.8, strata = "attrition")
# quick check — expected: <1177/293/1470> (train/test/total)
splitted
# define preprocess recipe from train dataset
# NOTE(review): step order is load-bearing — string->factor must precede
# step_downsample, and center/scale run after balancing so the scaling
# parameters are learned from the balanced data.
rec <- recipe(attrition ~ ., data = training(splitted)) %>%
# drop identifier columns with no predictive value
step_rm(employee_count, employee_number) %>%
# remove near-zero-variance predictors
step_nzv(all_predictors()) %>%
# encode nominal predictors as factors (outcome handled separately below)
step_string2factor(all_nominal(), -attrition) %>%
# fix the outcome's levels so "yes" is the first (positive) class
step_string2factor(attrition, levels = c("yes", "no")) %>%
# balance classes by downsampling the majority class to 1:1.
# NOTE(review): in current recipes this step lives in the {themis} package
# and `ratio`/`seed` were replaced by `under_ratio` — confirm the installed
# recipes version still supports this spelling.
step_downsample(attrition, ratio = 1/1, seed = 100) %>%
# standardise all numeric predictors (mean 0, sd 1)
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep(strings_as_factors = FALSE)
# get train and test dataset.
# juice() returns the prepped training data (superseded by
# bake(rec, new_data = NULL) in newer recipes); bake() applies the same
# transformations to the held-out rows — step_downsample is skipped at
# bake time by default, so the test set keeps its natural class balance.
data_train <- juice(rec)
data_test <- bake(rec, testing(splitted))
# quick check
head(data_train, 10)
# define model specification
# define model specification: random forest classifier
model_engine <- rand_forest(mode = "classification")
# define model engine: ranger backend with impurity-based importance
model_engine <- set_engine(
  object = model_engine,
  engine = "ranger",
  # fixed seed so the ranger fit itself is reproducible
  seed = 100,
  # use half the detected cores; parallel::detectCores() may return NA on
  # some platforms and plain division can yield a fraction, so guard with
  # integer division and a floor of one thread
  num.threads = max(1L, parallel::detectCores() %/% 2L, na.rm = TRUE),
  importance = "impurity"
)
# quick check
model_engine
#> Random Forest Model Specification (classification)
#>
#> Engine-Specific Arguments:
#> seed = 100
#> num.threads = max(1L, parallel::detectCores()%/%2L, na.rm = TRUE)
#> importance = impurity
#>
#> Computational engine: ranger
# set-up model grid: full factorial over mtry, trees and min_n with
# 3 levels per parameter -> 3^3 = 27 candidate specifications.
# NOTE(review): range_set(param, ...) is the older dials interface; current
# dials spells this mtry(range = c(...)) — confirm the installed version.
model_grid <- grid_regular(
# upper bound is ncol - 2: data_train still contains the outcome column,
# so this is presumably "all predictors minus one" — TODO confirm intent
range_set(mtry, range = c(2, ncol(data_train) - 2)),
range_set(trees, range = c(500, 1500)),
range_set(min_n, range = c(1, 30)),
levels = 3
)
# quick check
model_grid
# merge model engine and grid
# helper: zero-padded two-character spec id ("01", "02", ...)
make_spec_id <- function(n) str_pad(n, width = 2, side = "left", pad = "0")
# combine the engine with every grid row into one spec per candidate
model_specs <- tibble(spec = merge(model_engine, model_grid)) %>%
  mutate(spec_id = make_spec_id(row_number()))
# mirror the same ids onto the grid (row order matches merge()'s output)
# so results can be joined back to hyper-parameter values later
model_grid <- model_grid %>%
  mutate(spec_id = make_spec_id(row_number()))
# pair every cross-validation fold with every candidate specification
crossed <- crossing(cv_split, model_specs)
# fit one model per (fold, specification) pair, training each on the
# analysis portion of its fold
crossed <- crossed %>%
  mutate(model = map2(spec, splits, function(candidate, split) {
    train_fold <- analysis(split)
    fit_xy(
      object = candidate,
      x = select(train_fold, -attrition),
      y = select(train_fold, attrition)
    )
  }))
# quick check
head(crossed, 10)
# get hold-out performance in every folds
# score every fitted model on its fold's hold-out (assessment) rows;
# predictions are bound next to the true labels before summarising
cv_results <- crossed %>%
  mutate(performance = map2(model, splits, function(mod, split) {
    holdout <- assessment(split)
    holdout %>%
      bind_cols(predict(mod, holdout)) %>%
      summarise(
        sensitivity = sens_vec(attrition, .pred_class),
        precision = precision_vec(attrition, .pred_class)
      )
  }))
# unnest the per-fold performance metrics into regular columns
cv_results <- cv_results %>%
  select(spec_id, id, id2, performance) %>%
  unnest(performance)
# attach the hyper-parameter values for each spec; the join key is made
# explicit (spec_id is the only shared column, so the result is identical
# to the implicit natural join but no longer emits a "Joining, by" message)
cv_results <- cv_results %>%
  left_join(model_grid, by = "spec_id")
# quick check
head(cv_results, 10)
# get best model by sensitivity
# helper: min-max scale to [0, 1], rounded to 4 decimals
# (degenerate when max == min — all values equal — yielding NaN,
# same as the original arithmetic)
rescale01 <- function(x) round((x - min(x)) / (max(x) - min(x)), 4)
# rank candidate specs: reward high mean sensitivity across folds and
# penalise high variability, each component scaled to [0, 1] and
# weighted equally; keep the single best spec
best_model <- cv_results %>%
  group_by(spec_id) %>%
  summarise(mean = mean(sensitivity), sd = sd(sensitivity)) %>%
  ungroup() %>%
  mutate(
    score_mean = rescale01(mean),
    score_sd = 1 - rescale01(sd),
    score_total = 0.5 * score_mean + 0.5 * score_sd
  ) %>%
  arrange(desc(score_total)) %>%
  slice(1)
# check best model specification — join key made explicit (spec_id is the
# only shared column, so this matches the previous implicit natural join)
left_join(best_model, model_grid, by = "spec_id")
# get best model specification: look up the parsnip spec object by id
best_model_spec <- best_model %>%
  left_join(model_specs, by = "spec_id") %>%
  pull(spec) %>%
  pluck(1)
# fit the final model on the full (downsampled) training data
model <- fit_xy(
  object = best_model_spec,
  x = select(data_train, -attrition),
  y = select(data_train, attrition)
)
# quick check
model
#> parsnip model object
#>
#> Ranger result
#>
#> Call:
#> ranger::ranger(formula = formula, data = data, mtry = ~29L, num.trees = ~500L, min.node.size = ~15L, seed = ~100, num.threads = ~parallel::detectCores()/2, importance = ~"impurity", verbose = FALSE, probability = TRUE)
#>
#> Type: Probability estimation
#> Number of trees: 500
#> Sample size: 380
#> Number of independent variables: 30
#> Mtry: 29
#> Target node size: 15
#> Variable importance mode: impurity
#> Splitrule: gini
#> OOB prediction error (Brier s.): 0.1942027
# get variable importance as a two-column tibble; enframe() replaces
# tidy() on a bare named vector (tidy.numeric was deprecated and then
# removed from broom)
var_imp <- tibble::enframe(
  model$fit$variable.importance,
  name = "variable",
  value = "importance"
)
# keep the 10 MOST important variables — ranger reports importances in
# predictor order, not ranked, so the original head(10) without sorting
# did not actually select the top 10
var_imp <- var_imp %>%
  arrange(desc(importance)) %>%
  head(10) %>%
  mutate(variable = reorder(variable, importance))
# variable importance plot
ggplot(var_imp, aes(x = variable, y = importance)) +
geom_col(fill = "darkblue") +
coord_flip() +
labs(title = "Variables Importance (Top 10)", x = NULL, y = NULL, fill = NULL) +
scale_y_continuous(expand = expand_scale(mult = c(0, 0.1))) +
theme_minimal()# predict on test
# predict on test: hard class predictions plus class probabilities,
# bound next to the true labels; the predictor frame is built once and
# reused for both predict() calls
test_features <- select(data_test, -attrition)
pred_test <- data_test %>%
  select(attrition) %>%
  bind_cols(predict(model, test_features)) %>%
  bind_cols(predict(model, test_features, type = "prob"))
# quick check
head(pred_test, 10)
# metrics summary on the held-out test set
pred_test %>%
  summarise(
    accuracy = accuracy_vec(attrition, .pred_class),
    sensitivity = sens_vec(attrition, .pred_class),
    specificity = spec_vec(attrition, .pred_class),
    precision = precision_vec(attrition, .pred_class)
  )
# get roc curve data on test dataset
# ROC data on the test set, rounded and reshaped to long format so
# sensitivity and specificity can be drawn as separate lines
pred_test_roc <- pred_test %>%
  roc_curve(attrition, .pred_yes) %>%
  mutate_if(~ is.numeric(.), ~ round(., 4)) %>%
  gather(metric, value, -.threshold)
# plot sensitivity-specificity trade-off (interactive via plotly)
p <- ggplot(pred_test_roc, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  labs(
    x = "Probability Threshold to be Classified as Positive",
    y = "Value",
    colour = "Metrics"
  ) +
  theme_minimal()
ggplotly(p)
# get pr curve data on test dataset
# precision-recall data on the test set, same rounding and long-format
# reshaping as the ROC block above
pred_test_pr <- pred_test %>%
  pr_curve(attrition, .pred_yes) %>%
  mutate_if(~ is.numeric(.), ~ round(., 4)) %>%
  gather(metric, value, -.threshold)
# plot recall-precision trade-off (interactive via plotly)
p <- ggplot(pred_test_pr, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  labs(
    x = "Probability Threshold to be Classified as Positive",
    y = "Value",
    colour = "Metrics"
  ) +
  theme_minimal()
ggplotly(p)